load packages

library(ggplot2)
library(pheatmap)
library(factoextra)
library(EnhancedVolcano)
library(htmlwidgets)
library(plotly)
set.seed(123)

Install & load DESeq2 and Biobase (cannot be installed with conda on a windows machine)

if (!require("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

BiocManager::install("DESeq2")

BiocManager::install("Biobase")
library(DESeq2)
library(Biobase)

set working directory to where the you have the github reopsitory files

setwd("~/<path>/bioinfo_exc")

read count data

counts <- read.table("data/counts.txt", sep = "\t", header = TRUE, row.names = 1)

have a look

head(counts, 2)
##                 SRR1146076 SRR1146077 SRR1146078 SRR1146079 SRR1146080
## ENSG00000000003        605        372        118        327        515
## ENSG00000000005        254          7         33         10         60
##                 SRR1146081 SRR1146082 SRR1146083 SRR1146084 SRR1146085
## ENSG00000000003        456        532        611        721        913
## ENSG00000000005         19         76         10         44         52
##                 SRR1146086 SRR1146087 SRR1146088 SRR1146089 SRR1146090
## ENSG00000000003        957        687        693        668        776
## ENSG00000000005         82        108        138         22         97
##                 SRR1146091 SRR1146092 SRR1146093 SRR1146094 SRR1146095
## ENSG00000000003        746        357        606        684        372
## ENSG00000000005         64          4         20        133         27
##                 SRR1146096 SRR1146097 SRR1146098 SRR1146099 SRR1146100
## ENSG00000000003       2191        473        570        735        586
## ENSG00000000005        106         13         10         16         86
##                 SRR1146101 SRR1146102 SRR1146103 SRR1146104 SRR1146105
## ENSG00000000003        705        729        585        503        794
## ENSG00000000005        153         30         26         37         78
##                 SRR1146106 SRR1146107 SRR1146108 SRR1146109 SRR1146110
## ENSG00000000003        667        575       1206        753        418
## ENSG00000000005        256         60        142        148         28
##                 SRR1146111 SRR1146112 SRR1146113 SRR1146114 SRR1146115
## ENSG00000000003        789        487        777        920        673
## ENSG00000000005        311         31        428         98        133
##                 SRR1146116 SRR1146117 SRR1146118 SRR1146119 SRR1146120
## ENSG00000000003       1084        602       1873        714        873
## ENSG00000000005        185         91        121        101        151
##                 SRR1146121 SRR1146122 SRR1146123 SRR1146124 SRR1146125
## ENSG00000000003        641       1027        606        642        660
## ENSG00000000005         42         78         41         16        105
##                 SRR1146126 SRR1146127 SRR1146128 SRR1146129 SRR1146130
## ENSG00000000003        780       1094        443        518        866
## ENSG00000000005        103        112         38         75        244
##                 SRR1146131 SRR1146132 SRR1146133 SRR1146134 SRR1146135
## ENSG00000000003       1714        678        654        826        604
## ENSG00000000005        205         60        163         87         79
##                 SRR1146136 SRR1146137 SRR1146138 SRR1146139 SRR1146140
## ENSG00000000003        628        668        800        898        672
## ENSG00000000005         39        193        162        182         36
##                 SRR1146141 SRR1146142 SRR1146143 SRR1146144 SRR1146145
## ENSG00000000003        555       1014        915        849        824
## ENSG00000000005         27        201         89         94         72
##                 SRR1146146 SRR1146147 SRR1146148 SRR1146149 SRR1146150
## ENSG00000000003        511        520        432        562        628
## ENSG00000000005         38         56         33         54         41
##                 SRR1146151 SRR1146152 SRR1146153 SRR1146154 SRR1146155
## ENSG00000000003        575        383        723        449        815
## ENSG00000000005        175         82        100        110         58
##                 SRR1146156 SRR1146157 SRR1146158 SRR1146159 SRR1146160
## ENSG00000000003        423        538        498        484        441
## ENSG00000000005         15         20         51         38         13
##                 SRR1146161 SRR1146162 SRR1146163 SRR1146164 SRR1146165
## ENSG00000000003        483        415        545        381        464
## ENSG00000000005         61         31         17          5         15
##                 SRR1146166 SRR1146167 SRR1146168 SRR1146169 SRR1146170
## ENSG00000000003       1263        552        604        769        707
## ENSG00000000005        105        102         12        172         96
##                 SRR1146171 SRR1146172 SRR1146173 SRR1146174 SRR1146175
## ENSG00000000003       1312        724        913        988        895
## ENSG00000000005        172        151        106         98        155
##                 SRR1146176 SRR1146177 SRR1146178 SRR1146179 SRR1146180
## ENSG00000000003       1615        855        560        873        832
## ENSG00000000005        171        104        150         54         90
##                 SRR1146181 SRR1146182 SRR1146183 SRR1146184 SRR1146185
## ENSG00000000003        732       1086        739        432        538
## ENSG00000000005         79         42        128         82        121
##                 SRR1146186 SRR1146187 SRR1146188 SRR1146189 SRR1146190
## ENSG00000000003        507        763        986        806        544
## ENSG00000000005         25        226        165        195        110
##                 SRR1146191 SRR1146192 SRR1146193 SRR1146194 SRR1146195
## ENSG00000000003       1002        579        934        961        464
## ENSG00000000005        114        119         71         68         59
##                 SRR1146196 SRR1146197 SRR1146198 SRR1146199 SRR1146200
## ENSG00000000003        630        559        523        561       1197
## ENSG00000000005         81         72         47         63        102
##                 SRR1146201 SRR1146202 SRR1146203 SRR1146204 SRR1146205
## ENSG00000000003        595        798        548        500        501
## ENSG00000000005        253         40         38         32         56
##                 SRR1146206 SRR1146207 SRR1146208 SRR1146209 SRR1146210
## ENSG00000000003        625        906        799        427        576
## ENSG00000000005        172         29         24         43         37
##                 SRR1146211 SRR1146212 SRR1146213 SRR1146214 SRR1146215
## ENSG00000000003        412        637        577        553        474
## ENSG00000000005         62         39         72         15         38
##                 SRR1146216 SRR1146217 SRR1146218 SRR1146219 SRR1146220
## ENSG00000000003        747        510        486        425        536
## ENSG00000000005        157         27         42         19         26
##                 SRR1146221 SRR1146222 SRR1146223 SRR1146224 SRR1146225
## ENSG00000000003        628        388        378        862        508
## ENSG00000000005         46          9         13         51         16
##                 SRR1146226 SRR1146227 SRR1146228 SRR1146229 SRR1146230
## ENSG00000000003        538        548        932        590        522
## ENSG00000000005         38         28         48         26         39
##                 SRR1146231 SRR1146232 SRR1146233 SRR1146234 SRR1146235
## ENSG00000000003        954       1099        473        558        591
## ENSG00000000005         89         91         17          6         21
##                 SRR1146236 SRR1146237 SRR1146238 SRR1146239 SRR1146240
## ENSG00000000003        489        455       1058        621        243
## ENSG00000000005         71          3         61          9         27
##                 SRR1146241 SRR1146242 SRR1146243 SRR1146244 SRR1146245
## ENSG00000000003        248        784        510        546        744
## ENSG00000000005          5         11         80         25         55
##                 SRR1146246 SRR1146247 SRR1146248 SRR1146249 SRR1146250
## ENSG00000000003        901        829        526        426        513
## ENSG00000000005         57         51         28         66         55
##                 SRR1146252 SRR1146253 SRR1146254
## ENSG00000000003        464        421        619
## ENSG00000000005         45         13         37

are there missing values?

sum(is.na(counts))
## [1] 0

how many samples/genes?

dim(counts)
## [1] 57992   178

calculate count per million (CPM) in order to normalize counts by library size

cpm <- apply(counts,2, function(x) (x/sum(x))*1e6)

save count as RDS file

saveRDS(cpm, file = "noraml_lesion_CPM.rds")

read sample annotations

annot_samples <- read.table("data/sample-annotation.txt", sep = "\t", header = T, row.names = 1)

have a look

head(annot_samples, 2)
##                type
## SRR1146076   normal
## SRR1146077 lesional
summary(annot_samples)
##      type          
##  Length:178        
##  Class :character  
##  Mode  :character

how many samples in each type?

table(annot_samples$type)
## 
## lesional   normal 
##       95       83

how many samples? (are the same as number of column in counts?)

dim(annot_samples)[1]
## [1] 178

are the names the same in counts and annotation?

sum(colnames(counts) %in% rownames(annot_samples))
## [1] 178

make a vector of sample_id for each condition (normal/lesional)

samples_normal <- rownames(annot_samples)[annot_samples$type == 'normal']
samples_lesional <- rownames(annot_samples)[annot_samples$type == 'lesional']

Filter low expressing genes

for each condition mark genes that have CPM > 1 in at least 75% of samples

genes that pass the filter in the normal group

genes2keep_normal <- names(which(rowSums(cpm[, samples_normal] > 1) > length(samples_normal)*0.75))

how many are there?

length(genes2keep_normal) 
## [1] 17880

genes that pass the filter in the lesional group

genes2keep_lesional <- names(which(rowSums(cpm[, samples_lesional] > 1) > length(samples_lesional)*0.75))

how many are there?

length(genes2keep_lesional) 
## [1] 17248

combine the genes names and remove duplications

genes2keep <- unique(c(genes2keep_normal, genes2keep_lesional))

subset the count table according to the filtering condition

cpm <- cpm[genes2keep ,]

how many genes we have left?

dim(cpm)[1]
## [1] 18286

save filtered count as RDS file

saveRDS(cpm, file = "noraml_lesional_CPM_filtered.rds")

check the samples integrety and variation

transform data to log2CPM (add 1 to aviod taking log of 0)

log2cpm <- log2(cpm + 1)

plot log2(cpm) distribution for each sample in normal group

plot(density(log2cpm[,samples_normal[1]]), main="Distribution of log2cpm for each sample in 'Noraml' group")

for (i in samples_normal[2:length(samples_normal)]){
  lines(density(log2cpm[, i]))
}

the log2cpm distribution of each sample looks more or less similar in the normal group

plot log2cpm distribution for each sample in lesional group

plot(density(log2cpm[,samples_lesional[1]]), main="Distribution of log2cpm for each sample in 'Lesional' group")
for (i in samples_lesional[2:length(samples_lesional)]){
  lines(density(log2cpm[, i]))
}

from the plot its looks like one sample values distribute different than the rest

plot samples mean for the lesional group

hist((colMeans(log2cpm[,samples_lesional])), main="Mean log2cpm for Lesional group samples")

who is the sample with low mean?

which((colMeans(log2cpm[,samples_lesional])) < 3.5)
## SRR1146078 
##          2

sample SRR1146078 has values that are a bit lower than most of samples in the lesion group

change cpm matrix to dataframe for convience and look at the distribution of the different sample

log2cpm <- as.data.frame(log2cpm)
plot(density(log2cpm$SRR1146078), "log2pcm distribution in SRR1146078")

it is not very crutial because the shape of the distribution looks the same and there are many other samples in that group, but I will remove this sample (I will still have 94 samples left)

keep = !"SRR1146078" == names(log2cpm)
log2cpm <- log2cpm[, keep]

also remove tha sample from the annotation table

#copy row names to new column (otherwise they will be lost)
annot_samples$names <- rownames(annot_samples)

# filter rows 
keep <- !(rownames(annot_samples) == "SRR1146078") 
annot_samples <- annot_samples[keep,]

# remove 'names' column
annot_samples <- data.frame(type = annot_samples[, c("type")], row.names = annot_samples$names)

use top variable/expressed genes to measure variantion between samples

Compute the mean and variance for each gene

means <- rowMeans(log2cpm)
vars <- apply(log2cpm,1,var)

Sort, select and plot the top 500 highly variable genes from the data

vars <- sort( vars , decreasing=T)
top_var <- names(vars) [1:500]

Sort, select and plot the top 500 highly expressed genes from the data

means <- sort( means , decreasing=T)
top_means <- names(means) [1:500]

plot top 20 variable and expressed genes

par(mfrow=c(1,2),mar = c(5,3,2,1))
boxplot(t(log2cpm[ top_var[1:20],]),  las=2 , col="grey" , main="log2CPM (top var 20 genes)" ,cex=.2)
boxplot(t(log2cpm[ top_means[1:20],]),  las=2 , col="grey" , main="log2CPM (top express 20 genes)" ,cex=.2)

how many in both groups

sum(top_means %in% top_var)
## [1] 32
sum(top_var %in% top_means)
## [1] 32

PCA analysis of log2CPM data

PC of top 500 most vairable genes

PC <-  prcomp(t(log2cpm[top_var ,]), center = TRUE, scale. = TRUE)

# PC1 Vs PC2
plot(PC$x[,1] , PC$x[,2], cex=2, col=factor(annot_samples$type), xlab="PC1", ylab="PC2", 
     pch=16, main="PCA of top 500 most varaible genes", las=1)

# add sample labels (optional)
text(PC$x[,1] , PC$x[,2], cex=.7, labels = paste0(rownames(annot_samples)), pos = 3)

#PC3 Vs PC4
plot(PC$x[,3] , PC$x[,4], cex=2, col=factor(annot_samples$type), xlab="PC3", ylab="PC4",
     pch=16, main="PCA of top 500 most varaible genes", las=1)
text(PC$x[,3] , PC$x[,4], cex=.7, labels = paste0(rownames(annot_samples)), pos = 3)

from the PCA analysis we can see that PC1 is good at differentiating between the two types of samples while PC 2,3,4 do not seperate acording to type. however,its looks like there is a sample from one type with expression pattern more similar to samples from the other type (the black point among the pink ones)

chack sample grouping with top expressed genes

PC <-  prcomp(t(log2cpm[top_means ,]), center = TRUE, scale. = TRUE)

plot(PC$x[,1] , PC$x[,2], cex=2, col=factor(annot_samples$type), xlab="PC1", ylab="PC2", 
     pch=16, main="PCA of top 500 most varaible genes", las=1)

#text(PC$x[,1] , PC$x[,2], cex=.7, labels = paste0(rownames(annot_samples)), pos = 3)

plot(PC$x[,3] , PC$x[,4], cex=2, col=factor(annot_samples$type), xlab="PC3", ylab="PC4",
     pch=16, main="PCA of top 500 most varaible genes", las=1)

#text(PC$x[,3] , PC$x[,4], cex=.7, labels = paste0(rownames(annot_samples)), pos = 3)

can genearly see the same pattern but here some of the samples from the different groups are closer in PC1

use k-means to find the outlier sample

K-means on samples with top 500 expression. with k=2

top500_exp_k2 <- kmeans(t(log2cpm[top_means,]), 2)

plot the groups

fviz_cluster(list(data = t(log2cpm[top_means ,]), cluster = top500_exp_k2$cluster),
             ellipse.type = "norm", geom = "point", stand = FALSE,
             palette = "jco", ggtheme = theme_classic())

### we can see nice seperation between the groups

collect samples that were asigned to cluster 1 and 2

clust1 <- names(which(top500_exp_k2$cluster == 1))

clust2 <- names(which(top500_exp_k2$cluster == 2))

find which cluster coresponed to ‘lesional’ and to ‘normal’

lesional_samples <- rownames(annot_samples)[annot_samples$type == 'lesional']

normal_samples <- rownames(annot_samples)[annot_samples$type == 'normal']

which cluster represent which group?

sum(lesional_samples %in% clust1)  
## [1] 93
sum(normal_samples %in% clust2) 
## [1] 83

from the above I can conclude that k-mean cluster 1 is the ‘lesional’ group and the ‘normal’ group coresponed to cluster 2 (ATTENTION! the cluster annotation might be random so the cluster names (i.e. cluster1, cluster2) can switch when re-running the code

# I expect to find 1 sample from the  'lesional' group in cluster 2
sum(lesional_samples %in% clust2)
## [1] 1

so the one sample in clust2 that come from ‘lesional’ should be removed.

lesional_samples[lesional_samples %in% clust2]
## [1] "SRR1146216"

sample “SRR1146216” shold be removed!

keep = !"SRR1146216" == names(log2cpm)
log2cpm <- log2cpm[, keep]

also remove tha sample from the annotation table

keep <- !(rownames(annot_samples) == "SRR1146216") 

annot_samples <- annot_samples[keep,]

prepare table for Differential analsys with DESeq2

define ‘type’ column as factor and specify what levels is the ‘base’ (what we will compare to)

annot_samples$type <- factor(annot_samples$type, levels= c("normal" ,"lesional"))

check that the order of sample names is the same in the column of the CPM and rows of annotation table

all(colnames(log2cpm) == rownames(annot_samples))
## [1] TRUE

subset the count data based on the filtartion of CPM

counts <- counts[rownames(log2cpm), colnames(log2cpm)]

build deseq2 data set object

dds <- DESeqDataSetFromMatrix(countData = counts,
                              colData = annot_samples,
                              design = ~type)
## converting counts to integer mode

run DE analysis

dds <- DESeq(dds)

store the results table

res <- results(dds)
res
## log2 fold change (MLE): type lesional vs normal 
## Wald test p-value: type lesional vs normal 
## DataFrame with 18286 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat      pvalue
##                 <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000000003  671.8456     -0.4672663 0.0594921  -7.85425 4.02162e-15
## ENSG00000000005   74.4533     -1.3628353 0.1409134  -9.67144 3.98745e-22
## ENSG00000000419  850.1872      0.3829459 0.0512051   7.47867 7.50793e-14
## ENSG00000000457  613.6846     -0.0696658 0.0377814  -1.84392 6.51948e-02
## ENSG00000000460  282.1678      0.2476081 0.0435604   5.68424 1.31393e-08
## ...                   ...            ...       ...       ...         ...
## ENSG00000281103   79.6622       1.822028 0.1117338  16.30687 8.82002e-60
## ENSG00000282851   54.6938       0.477754 0.0862431   5.53962 3.03120e-08
## ENSG00000283253   55.0156       0.355840 0.0717942   4.95640 7.18125e-07
## ENSG00000283460   53.7003       0.777353 0.0793927   9.79123 1.22788e-22
## ENSG00000283566   54.6132       0.870658 0.0727316  11.97083 5.05183e-33
##                        padj
##                   <numeric>
## ENSG00000000003 1.08273e-14
## ENSG00000000005 1.41774e-21
## ENSG00000000419 1.90707e-13
## ENSG00000000457 7.87471e-02
## ENSG00000000460 2.57353e-08
## ...                     ...
## ENSG00000281103 1.05552e-58
## ENSG00000282851 5.82539e-08
## ENSG00000283253 1.27479e-06
## ENSG00000283460 4.47183e-22
## ENSG00000283566 2.69323e-32

read gene annotations

genes_annot <- read.table("data/gene-annotation.txt", sep="\t", header = T, quote = "", fill = F)

make a date frame with results and annotations and look at first 2 rows

res_annotated <- merge(as.data.frame(res), genes_annot, by.x=0, by.y=1, all.x=T)
head(res_annotated, 2)
##         Row.names baseMean log2FoldChange      lfcSE      stat       pvalue
## 1 ENSG00000000003 671.8456     -0.4672663 0.05949214 -7.854253 4.021621e-15
## 2 ENSG00000000005  74.4533     -1.3628353 0.14091345 -9.671436 3.987450e-22
##           padj ENTREZID SYMBOL      GENENAME
## 1 1.082735e-14     7105 TSPAN6 tetraspanin 6
## 2 1.417743e-21    64102   TNMD   tenomodulin

filter columns and write results

res_annotated <- res_annotated[, c(1,3,7,8,9,10)]

colnames(res_annotated)[1] <- "ENSMBL.ID"
write.table(res_annotated, "DESeq_results.tsv", sep = '\t', row.names = F)

select top 100 most significant and annotaed genes

res_annotated_only <- merge(as.data.frame(res), genes_annot, by.x=0, by.y=1)

order the table by p-adjusted

res_annotated_only <- res_annotated_only[order(res_annotated_only$padj) ,]

select top 100

res_annotated_only <- res_annotated_only[1:100 ,]

merge with log2cpm

top100_sig_cpm <- merge(res_annotated_only, log2cpm, by.x=1, by.y=0, all.x=T)

remove unnecessary columns

top100_sig_cpm <- top100_sig_cpm[, c(1,11:length(colnames(top100_sig_cpm)))]
rownames(top100_sig_cpm) <- top100_sig_cpm$Row.names

top100_sig_cpm <- top100_sig_cpm[, c(2:length(colnames(top100_sig_cpm)))]

plot heatmap of top 100 significant genes

z-score normalize the log2cpm

cal_z_score <- function(x){
  (x - mean(x)) / sd(x)
}

top100_sig_cpm_norm <- t(apply(top100_sig_cpm, 1, cal_z_score))

create and save heatmap

HM <- pheatmap(top100_sig_cpm_norm, annotation_col = annot_samples, show_rownames = F, show_colnames = F)
HM

pdf("Top100_sig_heatmap.pdf")
HM
dev.off()

plot volcano

check what is the minimum padj value of top 100 geens in order to color the plot with top 100 significant genes

res <- as.data.frame(res)
min_padj <- max(sort(res$padj)[1:100])

volcano plot with top 100 sig geens in red (FC cutoff 1)

vp <- EnhancedVolcano(res, x = "log2FoldChange", y = "padj", lab="", pCutoff = min_padj)
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-
## zero p-value...
vp

pdf("VolcanoPlot_top_100.pdf")
vp
dev.off()

generate interactive plot with plotly

make a table of gene information, differentail expression and log2cpm

expression_cpm <- merge(res_annotated, log2cpm, by.x=1, by.y=0)

sanity check

dim(expression_cpm)
## [1] 18286   182
dim(res_annotated)
## [1] 18286     6
dim(log2cpm)
## [1] 18286   176

calculate log2cpm mean for each type and

annot_samples$samples <- rownames(annot_samples)
# create a vector of samples name for normal and lesional
normal_samples <- annot_samples$samples[annot_samples$type == 'normal']
lesional_samples <- annot_samples$samples[annot_samples$type == 'lesional']

# add to main expression_cpm table the mean log2cpm columns for each type
expression_cpm$normal_log2cpm_mean <- rowMeans(expression_cpm[, normal_samples])
expression_cpm$lesional_log2cpm_mean <- rowMeans(expression_cpm[, lesional_samples])

tag genes as up-regulated, down regulated or not changed by padj < 0.05 and |log2FoldChange| > 1

expression_cpm$group <- ifelse(expression_cpm$padj < 0.05 & expression_cpm$log2FoldChange >= 1, 1, 0)
expression_cpm$group <- ifelse(expression_cpm$padj < 0.05 & expression_cpm$log2FoldChange <= -1, -1, expression_cpm$group)
expression_cpm$group <- as.factor(expression_cpm$group)

how many genes in each group?

table(expression_cpm$group)
## 
##    -1     0     1 
##  1162 16005  1119

1162 genes are downregulated in lesional compare noraml 1119 genes are upregulated 16005 do not change significatly

add annotation

expression_cpm$text <- paste("ENSMBL.ID: ", expression_cpm$ENSMBL.ID,
                             "\nlog2FoldChange: ", round(expression_cpm$log2FoldChange, 4),
                             "\nSYMBOL: ", expression_cpm$SYMBOL, 
                             "\nGENENAME: ", expression_cpm$GENENAME,
                             "\nPV_adj:", expression_cpm$padj, sep = "")

generate plot

plot <- ggplot(data = expression_cpm, aes(x = normal_log2cpm_mean, y = lesional_log2cpm_mean, color=group, text=text)) + 
  geom_point() + theme_classic() + 
  scale_color_manual(values = c("0" = "black", "-1" = "tomato2","1"= "skyblue4")) +
  theme(legend.position = "none", axis.line = element_blank()) + xlab("Normal [log2 CPM]") + 
  ylab("Lesional [log2 CPM]")
        


p2 <- ggplotly(plot, tooltip = c('text'))


# add link to each color group
for(i in 1:length(p2$x$data)){
  if(p2$x$data[[i]]$name %in% levels(expression_cpm$group)){
    p2$x$data[[i]]$customdata = expression_cpm$link
  }
}


js <- "
function(el) {
  el.on('plotly_click', function(d) {
    var url = d.points[0].customdata;
    //url
    window.open(url);
  });
}"


p3 <- ggplotly(onRender(p2, js))

p3

save plot and data

saveWidget(p3, "log2cpm_normal_vs_lesional.html")

 
write.csv(expression_cpm, "normal_vs_lesional_log2cpm_annotations.csv", row.names = F)

use ExpressionSet to store the data

read count data

counts <- read.table("data/counts.txt", sep = "\t", header = TRUE, row.names = 1)

read sample anotations

annot_samples <- read.table("data/sample-annotation.txt", sep = "\t", header = T, row.names = 1)

read gene annotations

genes_annot <- read.table("data/gene-annotation.txt", sep="\t", header = T, quote = "", fill = F, row.names = 1)

create feature table table

genes_annot <- merge(genes_annot,counts, by.x=0, by.y=0, all.y=T)[,c(1:3)]
rownames(genes_annot) <- genes_annot$Row.names
genes_annot <- genes_annot[, c(2,3)]

create ExpressionSet object with raw data

lesional_normal_raw_read_count_eset <- ExpressionSet(assayData = as.matrix(counts),
                      phenoData = AnnotatedDataFrame(annot_samples),
                      featureData = AnnotatedDataFrame(genes_annot))

sanity check

dim(lesional_normal_raw_read_count_eset)
## Features  Samples 
##    57992      178
dim(fData(lesional_normal_raw_read_count_eset))
## [1] 57992     2
head(fData(lesional_normal_raw_read_count_eset) ,2)
##                 ENTREZID SYMBOL
## ENSG00000000003     7105 TSPAN6
## ENSG00000000005    64102   TNMD
head(pData(lesional_normal_raw_read_count_eset), 2)
##                type
## SRR1146076   normal
## SRR1146077 lesional

prepare data to create Expression set for log2cpm data

# make sure number of features and samples are the same in count and annotation data
dim(log2cpm)
## [1] 18286   176
dim(annot_samples)
## [1] 178   1

create genes (features) table:

genes_annot <- read.table("data/gene-annotation.txt", sep="\t", header = T, quote = "", fill = F, row.names = 1)
genes_annot <- merge(genes_annot,log2cpm, by.x=0, by.y=0, all.y=T)[,c(1:3)]
rownames(genes_annot) <- genes_annot$Row.names
genes_annot <- genes_annot[, c(2,3)]

order the rows in both table (counts and features) in the same order

log2cpm  <- log2cpm[rownames(genes_annot) ,]

create sample (phenotype) table:

annot_samples <- read.table("data/sample-annotation.txt", sep = "\t", header = T, row.names = 1)
# ramove filtered samples
annot_samples$names <- rownames(annot_samples)
annot_samples <- annot_samples[annot_samples$names != "SRR1146216" ,]  
annot_samples <- annot_samples[annot_samples$names != "SRR1146078" ,]
annot_samples <- data.frame('type'=annot_samples[,1], row.names = rownames(annot_samples))

create ExpressionSet object with log2cpm

lesional_normal_log2cpm_read_count_eset <- ExpressionSet(assayData = as.matrix(log2cpm),
                                                     phenoData = AnnotatedDataFrame(annot_samples),
                                                     featureData = AnnotatedDataFrame(genes_annot))

sanity check

dim(lesional_normal_log2cpm_read_count_eset)
## Features  Samples 
##    18286      176
dim(fData(lesional_normal_log2cpm_read_count_eset))
## [1] 18286     2
head(fData(lesional_normal_log2cpm_read_count_eset), 2)
##                 ENTREZID SYMBOL
## ENSG00000000003     7105 TSPAN6
## ENSG00000000005    64102   TNMD
head(pData(lesional_normal_log2cpm_read_count_eset), 2)
##                type
## SRR1146076   normal
## SRR1146077 lesional